########### Algorithmic representation: Study 3 online survey experiment ######
# Code for the replication of the analyses reported in the paper:
# Finding the white male: The prevalence and consequences of 
#algorithmic gender and race bias in political Google searches

# doi: XXXXXXXXXXXXXX

# Contact corresponding author for help/information with code problems


#  OVERVIEW OF CODE SECTIONS----------------------------------------------------

# 0: Data preparation
# H3: Tests of baseline bias hypothesis
# H4: Tests of distribution bias hypothesis


#if using r studio, display content outline with Ctrl + Shift + O


# 0 - Data preparation ----------------------
#libraries
library(tidyverse) # for tidy data manipulations
library(lme4) # For multilevel models
library(lmerTest) # for multilevel models
library(marginaleffects) # extracting results from multilevel models
library(sjPlot) # exporting regression results
library(sjmisc) #exporting regresion resuls
library(mediation) # for mediation analysis (H4)
library(emmeans)
library(rstatix)
library(broom)
library(officer)
library(tidybayes) # for visualizations
library(viridis)
library(gghighlight)
library(Hmisc)
library(cowplot)
library(lemon)


#get local path to the folder containing the data files (within the folder where replication files are stored)
path_to_folder <- "" 
setwd(path_to_folder) #set working directory to that folder


#read_data and define data types
data_usa <- read.csv("data/vpn_data_study3_USA.csv",
                 na.strings = c("", "NA", "N/A")) %>%
  # tibble() %>%
  slice(3:n()) %>%
  #redefining data types
  mutate_if(is.character,
            ~ ifelse(grepl("^-?\\d+(\\.\\d+)?$", .), as.numeric(.), .))%>%
  mutate(
    sample = as.factor(rep("USA", n())), #variable encoding US sample
    #some renaming of variables    
    age = age_1,
    se_use = Q41_1,
    pre_representation = pre_representation_2,
    post_representation = post_representation_2,
    condition = factor(FL_18_DO, c("treatment_balanced","treatment_biased"), c("balanced","biased")),
    party_vote = factor(party_vote, 1:3, c("conservative","liberal","other")),
    gender = factor(gender, 1:4, c("male","female","other","other")))


data_uk <- read.csv("data/vpn_data_study3_UK.csv",
                     na.strings = c("", "NA", "N/A")) %>%
  # tibble() %>%
  slice(3:n()) %>%
  #  #redefining data types
  mutate_if(is.character,
            ~ ifelse(grepl("^-?\\d+(\\.\\d+)?$", .), as.numeric(.), .))%>%
  mutate(sample = as.factor(rep("UK", n())), #UK dummy
   #renaming varibales
          viability_1 = as.numeric(viability_1),
          viability_2 = as.numeric(viability_2),
         age = age_1,
    se_use = Q41_1,
         pre_representation = pre_representation_2,
         post_representation = post_representation_2,
         condition = factor(FL_18_DO, c("treatment_balanced","treatment_biased"), c("balanced","biased")),
         party_vote = factor(party_vote, 1:3, c("conservative","liberal","other")),
         gender = factor(gender, 1:4, c("male","female","other","other")))

#create selectors
communality_vars <- c("trait_friendly","trait_cooperative","trait_goodnatured","trait_sincere")
agency_vars <-  c("trait_competent","trait_confident","trait_rational","trait_ambitious")
fem_issues_vars <- c("issue_healthcare", "issue_education","issue_welfare")
masc_issues_vars <- c("issue_military", "issue_security","issue_economy")
viability_vars <- c("viability_1", "viability_2", "viability_3")
pre_efficacy <- c("pre_efficacy1", "pre_efficacy2")
post_efficacy <- c("post_efficacy1", "post_efficacy2")
ideology_vars <- c("ideology_1_1", "ideology_2_1")
pre_efficacy <- c("pre_efficacy1", "pre_efficacy2")
post_efficacy <- c("post_efficacy1", "post_efficacy2")
ideology_vars <- c("ideology_1_1", "ideology_2_1")


#create scales
data_usa <- data_usa%>%
  mutate(
    viability_3 = as.numeric(viability_3),
    communality = rowMeans(across(all_of(communality_vars)), na.rm = TRUE),
    agency = rowMeans(across(all_of(agency_vars)), na.rm = TRUE),
    fem_issues = rowMeans(across(all_of(fem_issues_vars)), na.rm = TRUE),
    masc_issues = rowMeans(across(all_of(masc_issues_vars)), na.rm = TRUE),
    viability = rowMeans(across(all_of(viability_vars)), na.rm = TRUE),
    pre_efficacy = rowMeans(across(all_of(pre_efficacy)), na.rm = TRUE),
    ideology = rowMeans(across(all_of(ideology_vars)), na.rm = TRUE),
    post_efficacy = rowMeans(across(all_of(post_efficacy)), na.rm = TRUE),
    diff_rep = post_representation - pre_representation,
    #rescale variables
    viability = scale(viability),
    communality = (communality - min(communality)) / (max(communality) - min(communality)),
    agency = (agency - min(agency)) / (max(agency) - min(agency)),
    fem_issues = (fem_issues - min(fem_issues)) / (max(fem_issues) - min(fem_issues)),
    masc_issues = (masc_issues - min(masc_issues)) / (max(masc_issues) - min(masc_issues)))



data_uk <- data_uk %>%
  mutate(
    viability_3 = as.numeric(viability_3),
    communality = rowMeans(across(all_of(communality_vars)), na.rm = TRUE),
    agency = rowMeans(across(all_of(agency_vars)), na.rm = TRUE),
    fem_issues = rowMeans(across(all_of(fem_issues_vars)), na.rm = TRUE),
    masc_issues = rowMeans(across(all_of(masc_issues_vars)), na.rm = TRUE),
    viability = rowMeans(across(all_of(viability_vars)), na.rm = TRUE),
    pre_efficacy = rowMeans(across(all_of(pre_efficacy)), na.rm = TRUE),
    ideology = rowMeans(across(all_of(ideology_vars)), na.rm = TRUE),
    post_efficacy = rowMeans(across(all_of(post_efficacy)), na.rm = TRUE),
    diff_rep = post_representation - pre_representation,
    #rescale variables
    viability = scale(viability),
    communality = (communality - min(communality)) / (max(communality) - min(communality)),
    agency = (agency - min(agency)) / (max(agency) - min(agency)),
    fem_issues = (fem_issues - min(fem_issues)) / (max(fem_issues) - min(fem_issues)),
    masc_issues = (masc_issues - min(masc_issues)) / (max(masc_issues) - min(masc_issues)))

#append data sets into one file
data <- bind_rows(data_usa, data_uk)


#create a long data frame for models with pre-/post within-subject effects
data_long <- data %>%
  pivot_longer(cols = starts_with(c("pre_", "post_")),
               names_to = c("time", ".value"),
               names_sep = "_")

#create long country subsets
d_long_uk <- data_long %>% filter(sample == "UK")
d_long_us <- data_long %>% filter(sample == "USA")


# H3: Perceptual Bias ------------------------------------------------
# We test the perceiptual bias hypothesis by running mixed effects models
# clustered around the individual participants. The hypothesis test focuses on
# the interaction term between time and condition.
# We additionally report marginal means

## UK data -------------------
m_representation_uk <- lmer(data = d_long_uk,
                         representation ~ 1 + condition + time +
                           condition*time +
                           gender + ideology + age + se_use + party_vote +
                           (1 | ResponseId)) 

## US data -----------------
m_representation_us <- lmer(data = d_long_us,
                            representation ~ 1 + condition + time +
                              condition*time +
                              gender + ideology + age + se_use + party_vote +
                              (1 | ResponseId)) 


## export regression models ---------
# These are the models that is reported in the main manuscript;
# It reproduces Table C3.1.1

tab_model(m_representation_us, m_representation_uk,
          file = "outputs/Study3_TableC3.1.1_MainModelsH3.doc")


## get contrasts/marginal means
emm_us <- emmeans(m_representation_us, "condition", by = "time")
contrast(emm_us)
tidy(contrast(emm_us), conf.int = TRUE)

emm_uk <- emmeans(m_representation_uk, "condition", by = "time")
contrast(emm_uk)
tidy(contrast(emm_uk), conf.int = TRUE)

# Conduct the t-test and calculate means, SDs, and Hedges' g
# The result of this is reported in the main manuscript in the results section
# for study 3

t_test_results <- data %>%
  mutate(
    perceptual_bias = case_when(
      sample == "USA" ~ post_representation - 28.7,
      sample == "UK" ~ post_representation - 34.6)) %>%
  t_test(perceptual_bias ~ condition) 


data %>%
  mutate(
    perceptual_bias = case_when(
      sample == "USA" ~ post_representation - 28.7,
      sample == "UK" ~ post_representation - 34.6))%>%
  group_by(condition) %>%
  summarise(
    mean = mean(perceptual_bias, na.rm = TRUE),
    sd = sd(perceptual_bias, na.rm =TRUE))


hedges_g_result <- data %>%
  mutate(
    perceptual_bias = case_when(
      sample == "USA" ~ post_representation - 28.7,
      sample == "UK" ~ post_representation - 34.6))%>%
  cohens_d(perceptual_bias ~ condition,
                            hedges.correction = TRUE)

# Visualization: Figure 3 ---------------------------




# Panel A: bar plot
data_hline <- data.frame(sample = unique(data_long$sample),  # Create data for lines
                         hline = c(28.7, 34.6), #Desc rep from lower chambers in US/UK
                         x = c(1,1))

bar_plot <- data_long %>%
  mutate(
    time = fct_relevel(time, "pre")
  ) %>%
  ggplot(
    aes(x = time, y = representation, group = condition, fill = condition)) +
  stat_summary(geom = "bar", fun.y = "mean",width = .75,
               alpha = .6,
               position = position_dodge(.8)) +
  stat_summary(geom = "errorbar", fun.data = "mean_se", width = .1, position = position_dodge(.8)) +
  scale_fill_manual(values = c("#35b779","#3b528b"))+
  facet_wrap(~ sample) +
  geom_hline(data = data_hline,
             aes(yintercept = hline), linetype = "dashed")+
  theme_classic() +
  coord_flex_cart(left = capped_vertical("both")) +
  labs(y = "Estimted share of women in parliament", x = "Time and condition",
       title = "", fill = "")+
  scale_y_continuous(labels = scales::percent_format(
    accuracy = 1L, scale = 1),
    breaks = seq(0,35,5)) +
  theme(
    strip.text.x = element_text(
      size = 12, face = "bold"),
    strip.background = element_blank())+
  guides(fill = "none")



# Panel B: Distributions of difference in representatin by condition

diff_rep <- data%>%
  mutate(
    perceptual_bias = case_when(
      sample == "USA" ~ post_representation-28.7,
      sample == "UK" ~post_representation - 34.6))%>%
  ggplot(
    aes(y = perceptual_bias, x = condition, 
        color = condition,
        fill = condition))+
  geom_hline(yintercept=0, linetype="dashed")+
  ggdist::stat_halfeye(
    adjust = .5,
    width = .6, alpha = .6,
    justification = -.2)  + 
  geom_point(
    ## draw horizontal lines instead of points
    shape = 95,
    size = 12,
    alpha = .5,
    color = "lightgrey"
  ) +
  ggdist::stat_pointinterval(
    adjust = .5,
    point_interval = "mean_qi",
    color = "black",
    justification = -.2)+
  theme_classic() +
  coord_flex_cart(left = capped_vertical("both")) +
  scale_y_continuous(labels = scales::percent_format(
    accuracy = 1L, scale = 1),
    breaks = seq(-40,40,10),
    limits = c(-40,40)) +
  scale_fill_manual(values = c("#35b779","#3b528b"))+ #"#3b528b" ##440154"
  scale_color_manual(values = c("#35b779","#3b528b"))+
  theme(legend.position = c(.85,.85),
        strip.text.x = element_text(
          size = 12, face = "bold"),
        strip.background = element_blank())+
  labs(
    x = "Condition", color = "", fill = "",
    y = expression("Net perceptual bias"~(Delta~"estimated - actual representation")))+
  guides(shape = "none") 


Figure3 <- cowplot::plot_grid(bar_plot, diff_rep,
                              nrow =1,
                              labels = "AUTO",
                              rel_widths =  c(1.3, 1))

ggsave(Figure3, filename = "outputs/Study3_Figure3_PerceptualBias.jpeg",
       width = 7.5, height = 4.5, dpi = 800)



# H4: Strategic Bias ---------------------------------------------------------------
# We test our last hypotheses by running separate mediation models    
# for electability (H4a) and external efficacy (H4b)


## H4a -  electability -------------
# Note that viability_1 is the name for the electability measure
# (slider scale)
# we pool the country data to achieve higher statistical power;
#country dummy is used as a statistical control


data <- data %>%drop_na(viability_1) %>%
  # calculate the perceptual bias as the difference between
  # misperceived represenstation adjusted by actual descriptive representation
  mutate(
    perceptual_bias = case_when( #calculate mediator
      sample == "USA" ~ post_representation-28.7, # adjust for country_level  desc rep
      sample == "UK" ~post_representation - 34.6)) %>% # adjust for country-level  descr rep
  filter(gender != "other") # remove 8 cases where gender is "other"

set.seed(666) #set seed for reproducibility of bootsrapping




#mediator model
med.fit_electability <- lm(perceptual_bias ~ condition + age  + gender + ideology + party_vote +se_use + sample,
              data = data )

#outcome model
out.fit_electability <- lm(viability_1 ~ condition + age  + gender + ideology + perceptual_bias +party_vote + se_use + sample,
              data =  data)

#mediation analysis
med.out_electability <- mediate(med.fit_electability,
                                out.fit_electability,
                                treat = "condition",
                                mediator = "perceptual_bias",
                   sims = 10000, boot = TRUE,
                   boot.ci.type = "bca")

#get ACME
summary(med.out_electability)

## H4b -  external efficacy -------------
med.fit_efficacy <- lm(perceptual_bias ~ condition + age  + gender + ideology + party_vote +se_use + sample,
                           data = data )
out.fit_efficacy <- lm(post_efficacy ~ condition + age  + gender + ideology + perceptual_bias +party_vote + se_use + sample,
                           data =  data)


med.out_efficacy <- mediate(med.fit_efficacy,
                                out.fit_efficacy,
                                treat = "condition", mediator = "perceptual_bias",
                                sims = 10000, boot = TRUE,
                                boot.ci.type = "bca")
#get ACME
summary(med.out_efficacy)



# get results for upper table of C3.1.2 in appendix
tab_model(
  med.fit_efficacy,
  out.fit_electability,
  out.fit_efficacy,
  file = "outputs/Study3_TableC3.1.2_MediationResults_UpperPart.doc"
)

# get results for bottom table of C3.1.2 in appendix

# Step 1: Tidy the mediation results
med_electability_results <- broom::tidy(med.out_electability)
med_efficacy_results <- broom::tidy(med.out_efficacy)

# Step 2: Combine results into a single data frame
results <- bind_rows(
  mutate(med_electability_results, model = "Electability"),
  mutate(med_efficacy_results, model = "Efficacy")
)

# Step 3: Create a Word document
doc <- read_docx() %>%
  body_add_par("Mediation Analysis Results", style = "heading 1") %>%
  body_add_table(value = results, style = "table_template")  # Adjust the table style as necessary

# Step 4: Save the Word document
print(doc, target = "outputs/Study3_TableC3.1.2_ResultsMediationAnalysis_BottomPart.docx")





# Visualization: Figure 4 ---------------------------
# Note: we used the output from the models in code section
# H4a and H4b to manually draw a path diagramm in powerpoint to create Figure 3



# Additional analyses (pre-registered hypotheses but not included in main manuscript) ---------------------------
# These are reported in Appendix section C3.2



communality_uk <- lm(communality ~ condition +
                       gender + ideology + age + se_use + party_vote,
                     data = data_uk)

communality_us <- lm(communality ~ condition +
                       gender + ideology + age + se_use + party_vote,
                     data = data_usa)

agency_uk <- lm(agency ~ condition +
                  gender + ideology + age + se_use + party_vote,
                data = data_uk)

agency_us <- lm(agency ~ condition +
                  gender + ideology + age + se_use + party_vote,
                data = data_usa)

tab_model(communality_us, agency_us,
          communality_uk, agency_uk,
          show.se = TRUE, show.ci=FALSE,
          file = "outputs/Study3_Table3.2.1_TraitModels.doc")


masc_uk <- lm(masc_issues ~ condition +
                gender + ideology + age + se_use + party_vote,
              data = data_uk)

masc_us <- lm(masc_issues ~ condition +
                gender + ideology + age + se_use + party_vote,
              data = data_usa)

fem_uk <- lm(fem_issues ~ condition +
               gender + ideology + age + se_use + party_vote,
             data = data_uk)

fem_us <- lm(fem_issues ~ condition +
               gender + ideology + age + se_use + party_vote,
             data = data_usa)

tab_model(masc_us, fem_us,
          masc_uk, fem_uk,
          show.se = TRUE, show.ci=FALSE,
          file = "outputs/Study3_TableC3.2.2_IssueModels.doc")



via_uk <- lm(viability ~ condition +
               gender + ideology + age + se_use + party_vote,
             data = data_uk)

via_us <- lm(viability ~ condition +
               gender + ideology + age + se_use + party_vote,
             data = data_usa)

eff_uk <- lm(post_efficacy ~ condition +
               gender + ideology + age + se_use + party_vote,
             data = data_uk)

eff_us <- lm(post_efficacy ~ condition +
               gender + ideology + age + se_use + party_vote,
             data = data_usa)


tab_model(via_us, eff_us,
          via_uk, eff_uk,
          show.se = TRUE, show.ci=FALSE,
          file = "outputs/Study3_Table3.2.3_ViabilityEfficacyModels.doc")


## Rerun main models without the non-pregistered covariate (search engine use) -------------------
m_representation_uk_nose <- lmer(data = d_long_uk,
                            representation ~ 1 + condition + time +
                              condition*time +
                              gender + ideology + age  + party_vote +
                              (1 | ResponseId)) 

m_representation_us_nose <- lmer(data = d_long_us,
                            representation ~ 1 + condition + time +
                              condition*time +
                              gender + ideology + age  + party_vote +
                              (1 | ResponseId)) 

# It reproduces Table C3.2.4
tab_model(m_representation_us, m_representation_uk,
          file = "outputs/Study3_TableC3.2.4_MainModelsNoSEUse.doc")



sessionInfo()
